# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from dbbooks.items import DbbooksItem
import re
import time

class BookbotSpider(scrapy.Spider):
    """Crawl Douban book listings for one tag and yield a ``DbbooksItem`` per book.

    Callback chain: ``parse`` -> ``parse_page_num`` -> ``parse_tag_list``
    -> ``parse_one_book``.

    The crawled tag defaults to ``keys[1]``. Because Scrapy copies spider
    arguments onto the instance, it can be overridden from the command line,
    e.g. ``scrapy crawl bookbot -a tag=math``.
    """

    name = 'bookbot'
    allowed_domains = ['book.douban.com']
    start_urls = ['http://book.douban.com/tag']
    keys = ['math','algorithm','machinelearning','人工智能']

    # Tag to crawl; same default the spider has always used.
    tag = keys[1]
    # Prefix of every tag listing URL.
    tag_url_prefix = 'https://book.douban.com/tag/'
    # Step of the ``start`` query parameter between consecutive listing pages
    # (presumably the number of books Douban shows per page -- confirm).
    page_size = 20

    # Throttle through the downloader instead of calling time.sleep() inside
    # callbacks: sleeping there blocks the Twisted reactor and stalls every
    # in-flight request. NOTE: this takes precedence over a DOWNLOAD_DELAY
    # set in the project's settings.py.
    custom_settings = {'DOWNLOAD_DELAY': 0.8}

    def _tag_url(self):
        """Return the listing URL of the first page for the configured tag."""
        return self.tag_url_prefix + str(self.tag)

    def parse(self, response):
        """Entry callback for ``start_urls``; schedule the tag's first listing page.

        The tag index page itself is not inspected: matching tags by their
        (Chinese) link text turned out to be very imprecise at times, so the
        configured tag is requested directly.
        """
        yield Request(url=self._tag_url(), callback=self.parse_page_num)

    def parse_page_num(self, response):
        """Read the paginator and schedule one request per listing page.

        Only numeric paginator labels are considered. When there are none
        (e.g. a tag whose results fit on a single page and therefore has no
        paginator links), the tag is treated as having exactly one page
        instead of raising ``ValueError`` from ``max()`` on an empty list.
        """
        labels = response.css('div.paginator>a::text').extract()
        page_numbers = [int(label) for label in labels if label.strip().isdigit()]
        total_page = max(page_numbers) if page_numbers else 1
        self.logger.info('%s total page: %s', response.url, total_page)

        base_url = self._tag_url()
        for page_index in range(total_page):
            url = base_url + '?start=' + str(self.page_size * page_index)
            yield Request(url=url, callback=self.parse_tag_list)

    def parse_tag_list(self, response):
        """Schedule a detail-page request for every book linked on a listing page."""
        for book_url in response.css('div.info h2 a::attr(href)').extract():
            yield Request(url=book_url, callback=self.parse_one_book)

    def parse_one_book(self, response):
        """Extract the fields of a single book page into a ``DbbooksItem``.

        Every field is a string. A field whose selector matches nothing is
        the empty string; ``link`` is always the URL of the response.
        """
        info_xpath = '//div[@id="info"]/span[contains(text(),"%s")]/following::text()[1]'

        title = response.css('div[id=wrapper] h1 span::text').extract()
        # Each labelled field is the first text node following its label span.
        press = response.xpath(info_xpath % "出版社").extract()
        publish_date = response.xpath(info_xpath % "出版年").extract()
        price = response.xpath(info_xpath % "定价").extract()

        # The first link inside the info box is taken as the author. Default
        # to '' so a page without any link there does not crash on None.
        author = response.css('div[id=info] a::text').extract_first(default='').strip()
        # Collapse runs of two or more whitespace characters into one space.
        author = re.sub(r"\s{2,}", ' ', author)

        # NOTE(review): this matches every <strong> on the page, not only the
        # rating element -- confirm a narrower selector is not needed.
        star = response.css('strong::text').extract()
        comment_num = response.css('div.rating_sum span a span::text').extract()

        item = DbbooksItem()
        item['title'] = ''.join(title)
        item['link'] = response.url
        item['press'] = ''.join(press)
        item['publish_date'] = ''.join(publish_date)
        item['price'] = ''.join(price)
        item['author'] = author
        item['star'] = ''.join(star)
        item['comment_num'] = ''.join(comment_num)
        yield item

